#!/bin/bash
# Launch an SGLang server (python3 -m sglang.launch_server) for the model
# stored at /data2/Qwen3-32B-FP8, bound to 0.0.0.0 on port 8888.
#
# Usage: run with no arguments; every setting is hard-coded below.

# NOTE(review): judging by its name, this allows --context-length to exceed
# the length declared in the model's own config — confirm against SGLang docs.
export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1

# Two device indices are listed, matching --tensor-parallel-size 2 below.
export CUDA_VISIBLE_DEVICES=0,1

# One option per line so individual settings can be reviewed and diffed.
# An array keeps each flag/value a separate word without line continuations.
server_args=(
  --model-path /data2/Qwen3-32B-FP8
  --port 8888
  --tensor-parallel-size 2
  --host 0.0.0.0
  --dtype auto
  --context-length 65536
  --reasoning-parser qwen3
  --kv-cache-dtype auto
  --max-running-requests 256
  --enable-hierarchical-cache
  # NOTE(review): unit of --hicache-size is not visible here — TODO confirm.
  --hicache-size 100
  --mem-fraction-static 0.9
)

# exec replaces this shell with the server process, so signals sent to the
# script's PID (e.g. SIGTERM from a supervisor) reach the server directly
# instead of killing only the wrapper shell and leaving the server running.
exec python3 -m sglang.launch_server "${server_args[@]}"